import scrapy
import time
import json
from bsws_spider.items import BlogItem
class CnblogsSpider(scrapy.Spider):
    """Crawl blog posts from cnblogs.com.

    Starts from the site-wide listing pages (1..200), follows every post to
    its detail page, then chains three AJAX endpoints per post to collect
    category/tags, view count, and comment count. Also discovers each
    author's personal blog index and paginates through it.
    """

    name = 'cnblogs'
    allowed_domains = ['cnblogs.com']  # restrict the crawl to this domain
    # Site-home listing pages, 200 pages total.
    start_urls = [f'https://www.cnblogs.com/sitehome/p/{i}' for i in range(1, 201)]
    # Headers (including the cookie) expected by the cnblogs AJAX endpoints.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36 Qiyu/2.1.1.1',
        'referer': 'https://www.cnblogs.com/',
        'cookie': '__gads=ID=9438c1cd86c17866:T=1543647755:S=ALNI_Mbi3TShe973wtyvo52ld2uDpxPKaA; _ga=GA1.2.1162600031.1543647753; _gid=GA1.2.1552287033.1557489053'
    }

    def parse(self, response):
        """Listing page: follow every post-title link to its detail page."""
        for url in response.xpath('//a[@class="titlelnk"]/@href').extract():
            yield scrapy.Request(url, callback=self.parse_info)

    def parse_info(self, response):
        """Detail page: populate a BlogItem and chain the category/tag AJAX call.

        Returns a list containing the AJAX request (carrying the partially
        filled item in ``meta``) and, when present, a request for the
        author's own blog index.
        """
        data_dict = BlogItem()  # item container
        data_dict['href'] = response.url
        data_dict['source'] = '博客园'
        data_dict['title'] = response.xpath('//a[@id="cb_post_title_url"]/text()').extract_first()
        # The date/author footer appears in one of several page layouts;
        # token positions differ between the "by"-style and default footers.
        info_list = response.xpath(
            'string(//div/small|//div[@class="postDesc"]|//*[@class="postfoot"])'
        ).extract_first().strip().split()
        if 'by' in info_list:
            data_dict['fbrq'] = info_list[0]     # publish date
            data_dict['author'] = info_list[3]
        else:
            if not info_list:
                # No footer found at all: emit the partial item as-is.
                return data_dict
            data_dict['fbrq'] = info_list[2]
            data_dict['author'] = info_list[4]
        # The remaining fields come from AJAX endpoints; blogId/blogApp must
        # be scraped out of an inline <script> block on the page.
        script_text = response.xpath('//script[2]/text()').extract_first()
        blogId = script_text.split(';')[0].split('=')[-1]
        blogApp = script_text.split(';')[1].split(',')[0].split('=')[-1].strip("'")
        postId = response.url.split('/')[-1].split('.')[0]
        _ = int(time.time() * 1000)  # cache-busting timestamp, e.g. 1556110169047
        more_info_url = f'https://www.cnblogs.com/mvc/blog/CategoriesTags.aspx?blogApp={blogApp}&blogId={blogId}&postId={postId}&_={_}'
        data_dict['contents'] = response.xpath('//*[@id="cnblogs_post_body"]').extract_first()

        request_list = [scrapy.Request(
            more_info_url,
            callback=self.parse_more_info,
            meta={'data': data_dict, 'blogId': blogId, 'blogApp': blogApp, 'postId': postId, '_': _},
            headers=self.headers,
        )]
        # Also crawl the author's personal blog index when it is linked.
        new_url = response.xpath('//*[@id="Header1_HeaderTitle"]/@href').extract_first()
        if new_url:
            request_list.append(scrapy.Request(new_url, callback=self.parse_author_word))
        return request_list

    def parse_author_word(self, response):
        """Author blog index: follow every listed post and paginate."""
        # Detail-page links for each post in the author's list.
        for url in response.xpath('//div[@class="postTitle"]/a/@href').extract():
            yield scrapy.Request(url, callback=self.parse_info)
        # Follow the next-page link, if any.
        nav_next_page = response.xpath('//*[@id="nav_next_page"]/@href').extract_first()
        if nav_next_page:
            # BUG FIX: this method is a generator, so the original
            # `return scrapy.Request(...)` only set the StopIteration value
            # and the request was silently discarded — it must be yielded.
            yield scrapy.Request(nav_next_page, callback=self.parse_author_word)

    def parse_more_info(self, response):
        """Category/tags AJAX response: parse it and chain the view-count call.

        On a JSON decode failure the partially filled item is returned as-is
        (deliberate best-effort: the post data gathered so far is still kept).
        """
        result = response.meta.get('data', {})
        blogId = response.meta.get('blogId')
        blogApp = response.meta.get('blogApp')
        postId = response.meta.get('postId')
        _ = response.meta.get('_')
        try:
            res = json.loads(response.text)
            # The endpoint returns HTML fragments; pull the anchor texts out.
            result['category'] = ','.join(
                scrapy.Selector(text=res['Categories']).xpath('//a/text()').extract())
            result['labels'] = ','.join(
                scrapy.Selector(text=res['Tags']).xpath('//a/text()').extract())
            return scrapy.Request(
                f'https://www.cnblogs.com/mvc/blog/ViewCountCommentCout.aspx?postId={postId}',
                headers=self.headers, callback=self.parse_read_num,
                meta={'data': result, 'blogId': blogId, 'blogApp': blogApp, 'postId': postId, '_': _})
        except json.decoder.JSONDecodeError:
            return result

    def parse_read_num(self, response):
        """View-count AJAX response: store it and chain the comment-count call."""
        result = response.meta.get('data', {})
        blogApp = response.meta.get('blogApp')
        postId = response.meta.get('postId')
        _ = response.meta.get('_')
        result['read_num'] = response.text.strip()
        return scrapy.Request(
            f'https://www.cnblogs.com/mvc/blog/GetComments.aspx?postId={postId}&blogApp={blogApp}&pageIndex=0&anchorCommentId=0&_={_}',
            headers=self.headers, callback=self.parse_reply_num,
            meta={'data': result})

    def parse_reply_num(self, response):
        """Comment-count AJAX response: store it and emit the finished item."""
        result = response.meta.get('data', {})
        result['reply_num'] = json.loads(response.text.strip())['commentCount']
        return result